1 Introduction

This page introduces the Neotoma APIs and describes situations in which calling them directly may be preferable to using the neotoma2 R package.

2 What is an API?

2.1 The R package and the API

3 Call time comparison

# Bounding box 0: rectangle with corners at 43-50 degrees latitude and
# -65 to -60 degrees longitude.
lats <- c(43, 50, 50, 43)
lons <- c(-65, -65, -60, -60)

coordinates <- data.frame(lat = lats, lon = lons)

# Combine the four corner points into one geometry, then cast it to a polygon.
coordinates_sf <- coordinates %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_cast("POLYGON")

bbox_geojson <- sf_geojson(coordinates_sf)

# Time the assignment itself so the request is issued only once and the
# recorded time belongs to the result that is actually kept.
# `<-` is required here: `=` inside a call would be read as a named argument.
R_getsites_time <- system.time(
  Rsites <- neotoma2::get_sites(loc = bbox_geojson, all_data = TRUE)
)

# Same query sent straight to the API; one URL is used for both the stored
# result and its timing so the two cannot drift apart.
api_sites_url <- paste0(
  "https://api.neotomadb.org/v2.0/data/sites?loc=", bbox_geojson,
  "&limit=9999&offset=0"
)
api_getsites_time <- system.time(
  api_sites <- content(GET(api_sites_url))$data
)

print(R_getsites_time)
##    user  system elapsed 
##    1.04    0.05    2.31
print(api_getsites_time)
##    user  system elapsed 
##    0.03    0.00    0.75
print(length(api_sites))
## [1] 153
# Bounding box 1: rectangle with corners at 43-50 degrees latitude and
# -70 to -60 degrees longitude.
lats1 <- c(43, 50, 50, 43)
lons1 <- c(-70, -70, -60, -60)

coordinates1 <- data.frame(lat = lats1, lon = lons1)

# Combine the four corner points into one geometry, then cast it to a polygon.
coordinates1_sf <- coordinates1 %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_cast("POLYGON")

bbox_geojson1 <- sf_geojson(coordinates1_sf)

# Time the assignment itself so the request is issued only once and the
# recorded time belongs to the result that is actually kept.
# `<-` is required here: `=` inside a call would be read as a named argument.
R_getsites_time1 <- system.time(
  Rsites1 <- neotoma2::get_sites(loc = bbox_geojson1, all_data = TRUE)
)

# Same query sent straight to the API; one URL is used for both the stored
# result and its timing so the two cannot drift apart.
api_sites_url1 <- paste0(
  "https://api.neotomadb.org/v2.0/data/sites?loc=", bbox_geojson1,
  "&limit=9999&offset=0"
)
api_getsites_time1 <- system.time(
  api_sites1 <- content(GET(api_sites_url1))$data
)


print(R_getsites_time1)
##    user  system elapsed 
##    3.14    0.13    7.53
print(api_getsites_time1)
##    user  system elapsed 
##    0.02    0.00    1.36
print(length(api_sites1))
## [1] 479
# Bounding box 2: rectangle with corners at 33-50 degrees latitude and
# -75 to -60 degrees longitude.
lats2 <- c(33, 50, 50, 33)
lons2 <- c(-75, -75, -60, -60)

coordinates2 <- data.frame(lat = lats2, lon = lons2)

# Combine the four corner points into one geometry, then cast it to a polygon.
coordinates2_sf <- coordinates2 %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_cast("POLYGON")

bbox_geojson2 <- sf_geojson(coordinates2_sf)

# Time the assignment itself so the request is issued only once and the
# recorded time belongs to the result that is actually kept.
# `<-` is required here: `=` inside a call would be read as a named argument.
R_getsites_time2 <- system.time(
  Rsites2 <- neotoma2::get_sites(loc = bbox_geojson2, all_data = TRUE)
)

# Same query sent straight to the API; one URL is used for both the stored
# result and its timing so the two cannot drift apart.
api_sites_url2 <- paste0(
  "https://api.neotomadb.org/v2.0/data/sites?loc=", bbox_geojson2,
  "&limit=9999&offset=0"
)
api_getsites_time2 <- system.time(
  api_sites2 <- content(GET(api_sites_url2))$data
)


print(R_getsites_time2)
##    user  system elapsed 
##   11.03    0.31   25.99
print(api_getsites_time2)
##    user  system elapsed 
##    0.20    0.00    2.16
print(length(api_sites2))
## [1] 1664
# Bounding box 3: rectangle with corners at 23-50 degrees latitude and
# -80 to -60 degrees longitude.
lats3 <- c(23, 50, 50, 23)
lons3 <- c(-80, -80, -60, -60)

coordinates3 <- data.frame(lat = lats3, lon = lons3)

# Combine the four corner points into one geometry, then cast it to a polygon.
coordinates3_sf <- coordinates3 %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_cast("POLYGON")

bbox_geojson3 <- sf_geojson(coordinates3_sf)

# Time the assignment itself so the request is issued only once and the
# recorded time belongs to the result that is actually kept.
# `<-` is required here: `=` inside a call would be read as a named argument.
R_getsites_time3 <- system.time(
  Rsites3 <- neotoma2::get_sites(loc = bbox_geojson3, all_data = TRUE)
)

# Same query sent straight to the API; one URL is used for both the stored
# result and its timing so the two cannot drift apart.
api_sites_url3 <- paste0(
  "https://api.neotomadb.org/v2.0/data/sites?loc=", bbox_geojson3,
  "&limit=9999&offset=0"
)
api_getsites_time3 <- system.time(
  api_sites3 <- content(GET(api_sites_url3))$data
)


print(R_getsites_time3)
##    user  system elapsed 
##   20.64    0.58  152.62
print(api_getsites_time3)
##    user  system elapsed 
##    0.60    0.00    3.97
print(length(api_sites3))
## [1] 3205
# Bounding box 4: rectangle with corners at 23-50 degrees latitude and
# -90 to -60 degrees longitude.
lats4 <- c(23, 50, 50, 23)
lons4 <- c(-90, -90, -60, -60) # Reordered for a rectangle

coordinates4 <- data.frame(lat = lats4, lon = lons4)

# Combine the four corner points into one geometry, then cast it to a polygon.
coordinates4_sf <- coordinates4 %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  summarise(geometry = st_combine(geometry)) %>%
  st_cast("POLYGON")

bbox_geojson4 <- sf_geojson(coordinates4_sf)

# Time the assignment itself so the request is issued only once and the
# recorded time belongs to the result that is actually kept.
# `<-` is required here: `=` inside a call would be read as a named argument.
R_getsites_time4 <- system.time(
  Rsites4 <- neotoma2::get_sites(loc = bbox_geojson4, all_data = TRUE)
)
## Warning in .f(.x[[i]], ...): Dataset(s) 25582, 25583, 6448 may have been recently removed from the database. Affected sites/datasets will be removed when you do `get_datasets` or `get_downloads`

# Same query sent straight to the API; one URL is used for both the stored
# result and its timing so the two cannot drift apart.
api_sites_url4 <- paste0(
  "https://api.neotomadb.org/v2.0/data/sites?loc=", bbox_geojson4,
  "&limit=9999&offset=0"
)
api_getsites_time4 <- system.time(
  api_sites4 <- content(GET(api_sites_url4))$data
)



print(R_getsites_time4)
##    user  system elapsed 
##   40.42    1.27  431.39
print(api_getsites_time4)
##    user  system elapsed 
##    0.81    0.04    4.82
print(length(api_sites4))
## [1] 6231
# Draw the outline of every bounding box on an OpenStreetMap raster that is
# requested for coordinates4_sf. Layers are added one at a time, in the same
# order as a single chained expression would add them.
bbox_map <- tm_shape(osm.raster(coordinates4_sf)) + tm_rgb()
bbox_map <- bbox_map + tm_shape(coordinates4_sf) + tm_borders(col = "red")
bbox_map <- bbox_map + tm_shape(coordinates3_sf) + tm_borders(col = "blue")
bbox_map <- bbox_map + tm_shape(coordinates2_sf) + tm_borders(col = "black")
bbox_map <- bbox_map + tm_shape(coordinates1_sf) + tm_borders(col = "green")
bbox_map <- bbox_map + tm_shape(coordinates_sf) + tm_borders(col = "white")

# Auto-print the finished map.
bbox_map
## Zoom: 5

# Pull the elapsed (wall-clock) entry out of a system.time() result.
elapsed_of <- function(timing) timing[["elapsed"]]

# Elapsed seconds for the R package calls, in bounding-box order 0 to 4.
Rtimes <- vapply(
  list(
    R_getsites_time, R_getsites_time1, R_getsites_time2,
    R_getsites_time3, R_getsites_time4
  ),
  elapsed_of,
  numeric(1)
)

# Elapsed seconds for the direct API calls, in the same order.
apitimes <- vapply(
  list(
    api_getsites_time, api_getsites_time1, api_getsites_time2,
    api_getsites_time3, api_getsites_time4
  ),
  elapsed_of,
  numeric(1)
)

# Number of sites returned by each API query.
site_num <- lengths(
  list(api_sites, api_sites1, api_sites2, api_sites3, api_sites4)
)

time_df <- data.frame(Rt = Rtimes, api_t = apitimes, sites = site_num)

# Red points: R package timings. Blue points: direct API timings.
ggplot(time_df) +
  geom_point(aes(x = sites, y = Rt), color = "red", alpha = 0.7) +
  geom_point(aes(x = sites, y = api_t), color = "blue", alpha = 0.7) +
  theme_bw() +
  scale_y_continuous(name = "time (seconds)") +
  scale_x_continuous(name = "number of sites")

4 The JSON format

Web APIs return their responses in JSON (JavaScript Object Notation) format. JSON represents data as arrays of objects, where each object is a set of keys (property names) paired with values. A value might be a number or a string, or it could itself be an object or an array of objects. In R, it is natural to represent these JSON arrays as nested lists. However, an API response is often easier to inspect as a table than as a nested list, and converting the list into a table requires some looping.

# Print the first six elements of api_sites4 as nested lists.
head(x = api_sites4, n = 6L)
## [[1]]
## [[1]]$siteid
## [1] 7
## 
## [[1]]$sitename
## [1] "Three Pines Bog"
## 
## [[1]]$sitedescription
## [1] "Bog."
## 
## [[1]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-80.11667,47]}"
## 
## [[1]]$altitude
## [1] 329
## 
## [[1]]$collectionunits
## [[1]]$collectionunits[[1]]
## [[1]]$collectionunits[[1]]$handle
## [1] "3PINES"
## 
## [[1]]$collectionunits[[1]]$datasets
## [[1]]$collectionunits[[1]]$datasets[[1]]
## [[1]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 7
## 
## [[1]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen"
## 
## 
## 
## [[1]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[1]]$collectionunits[[1]]$collectionunitid
## [1] 7
## 
## [[1]]$collectionunits[[1]]$collectionunittype
## [1] "Core"
## 
## 
## [[1]]$collectionunits[[2]]
## [[1]]$collectionunits[[2]]$handle
## [1] "3PINES"
## 
## [[1]]$collectionunits[[2]]$datasets
## [[1]]$collectionunits[[2]]$datasets[[1]]
## [[1]]$collectionunits[[2]]$datasets[[1]]$datasetid
## [1] 7857
## 
## [[1]]$collectionunits[[2]]$datasets[[1]]$datasettype
## [1] "geochronologic"
## 
## 
## 
## [[1]]$collectionunits[[2]]$collectionunit
## NULL
## 
## [[1]]$collectionunits[[2]]$collectionunitid
## [1] 7
## 
## [[1]]$collectionunits[[2]]$collectionunittype
## [1] "Core"
## 
## 
## 
## 
## [[2]]
## [[2]]$siteid
## [1] 10
## 
## [[2]]$sitename
## [1] "Site 1 (Cohen unpublished)"
## 
## [[2]]$sitedescription
## NULL
## 
## [[2]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-82.33,30.83]}"
## 
## [[2]]$altitude
## [1] 36
## 
## [[2]]$collectionunits
## [[2]]$collectionunits[[1]]
## [[2]]$collectionunits[[1]]$handle
## [1] "ADC001"
## 
## [[2]]$collectionunits[[1]]$datasets
## [[2]]$collectionunits[[1]]$datasets[[1]]
## [[2]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 10
## 
## [[2]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen surface sample"
## 
## 
## 
## [[2]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[2]]$collectionunits[[1]]$collectionunitid
## [1] 10
## 
## [[2]]$collectionunits[[1]]$collectionunittype
## [1] "Modern"
## 
## 
## 
## 
## [[3]]
## [[3]]$siteid
## [1] 38
## 
## [[3]]$sitename
## [1] "Wardsg28"
## 
## [[3]]$sitedescription
## [1] "Physiography: driftless area."
## 
## [[3]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-89.93,42.3]}"
## 
## [[3]]$altitude
## [1] 272
## 
## [[3]]$collectionunits
## [[3]]$collectionunits[[1]]
## [[3]]$collectionunits[[1]]$handle
## [1] "AMD005"
## 
## [[3]]$collectionunits[[1]]$datasets
## [[3]]$collectionunits[[1]]$datasets[[1]]
## [[3]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 38
## 
## [[3]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen surface sample"
## 
## 
## 
## [[3]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[3]]$collectionunits[[1]]$collectionunitid
## [1] 38
## 
## [[3]]$collectionunits[[1]]$collectionunittype
## [1] "Modern"
## 
## 
## 
## 
## [[4]]
## [[4]]$siteid
## [1] 43
## 
## [[4]]$sitename
## [1] "Wardsg17"
## 
## [[4]]$sitedescription
## [1] "Physiography: driftless area."
## 
## [[4]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-89.93,42.32]}"
## 
## [[4]]$altitude
## [1] 271
## 
## [[4]]$collectionunits
## [[4]]$collectionunits[[1]]
## [[4]]$collectionunits[[1]]$handle
## [1] "AMD010"
## 
## [[4]]$collectionunits[[1]]$datasets
## [[4]]$collectionunits[[1]]$datasets[[1]]
## [[4]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 43
## 
## [[4]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen surface sample"
## 
## 
## 
## [[4]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[4]]$collectionunits[[1]]$collectionunitid
## [1] 43
## 
## [[4]]$collectionunits[[1]]$collectionunittype
## [1] "Modern"
## 
## 
## 
## 
## [[5]]
## [[5]]$siteid
## [1] 45
## 
## [[5]]$sitename
## [1] "Blanch Road"
## 
## [[5]]$sitedescription
## [1] "Physiography: driftless area."
## 
## [[5]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-89.87,42.77]}"
## 
## [[5]]$altitude
## [1] 272
## 
## [[5]]$collectionunits
## [[5]]$collectionunits[[1]]
## [[5]]$collectionunits[[1]]$handle
## [1] "AMD012"
## 
## [[5]]$collectionunits[[1]]$datasets
## [[5]]$collectionunits[[1]]$datasets[[1]]
## [[5]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 45
## 
## [[5]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen surface sample"
## 
## 
## 
## [[5]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[5]]$collectionunits[[1]]$collectionunitid
## [1] 45
## 
## [[5]]$collectionunits[[1]]$collectionunittype
## [1] "Modern"
## 
## 
## 
## 
## [[6]]
## [[6]]$siteid
## [1] 56
## 
## [[6]]$sitename
## [1] "Hollndal"
## 
## [[6]]$sitedescription
## [1] "Physiography: driftless area."
## 
## [[6]]$geography
## [1] "{\"type\":\"Point\",\"crs\":{\"type\":\"name\",\"properties\":{\"name\":\"EPSG:4326\"}},\"coordinates\":[-89.88,42.88]}"
## 
## [[6]]$altitude
## [1] 287
## 
## [[6]]$collectionunits
## [[6]]$collectionunits[[1]]
## [[6]]$collectionunits[[1]]$handle
## [1] "AMD023"
## 
## [[6]]$collectionunits[[1]]$datasets
## [[6]]$collectionunits[[1]]$datasets[[1]]
## [[6]]$collectionunits[[1]]$datasets[[1]]$datasetid
## [1] 56
## 
## [[6]]$collectionunits[[1]]$datasets[[1]]$datasettype
## [1] "pollen surface sample"
## 
## 
## 
## [[6]]$collectionunits[[1]]$collectionunit
## NULL
## 
## [[6]]$collectionunits[[1]]$collectionunitid
## [1] 56
## 
## [[6]]$collectionunits[[1]]$collectionunittype
## [1] "Modern"
# Flatten the nested API response into one table row per collection-unit
# entry. Only the first dataset listed under each entry is recorded.
site_fields <- c(
  "siteid", "name", "description", "geography", "altitude", "handle",
  "collectionunit", "collectionunitid", "collectionunittype",
  "datasetid", "datasettype"
)

# Convert one API field to a character scalar; fields the API returned as
# null (NULL in R) become NA.
as_cell <- function(value) {
  if (is.null(value)) NA_character_ else as.character(value)
}

# Build the character matrix of rows for a single site.
flatten_site <- function(site) {
  units <- site[["collectionunits"]]
  if (length(units) == 0) {
    # Keep a site without collection units as one row with NA unit fields.
    # seq(length(units)) would instead iterate over c(1, 0).
    units <- list(NULL)
  }
  unit_rows <- lapply(units, function(unit) {
    datasets <- unit[["datasets"]]
    # Guard the [[1]] lookup: it errors on an empty list.
    first_dataset <- if (length(datasets) > 0) datasets[[1]] else NULL
    vapply(
      list(
        site[["siteid"]],
        site[["sitename"]],
        site[["sitedescription"]],
        site[["geography"]],
        site[["altitude"]],
        unit[["handle"]],
        unit[["collectionunit"]],
        unit[["collectionunitid"]],
        unit[["collectionunittype"]],
        first_dataset[["datasetid"]],
        first_dataset[["datasettype"]]
      ),
      as_cell,
      character(1)
    )
  })
  do.call(rbind, unit_rows)
}

site_mat <- do.call(rbind, lapply(api_sites4, flatten_site))
if (is.null(site_mat)) {
  # No sites at all: still produce a table with the expected columns.
  site_mat <- matrix(character(0), nrow = 0, ncol = length(site_fields))
}

site_df <- as.data.frame(site_mat)
names(site_df) <- site_fields

datatable(site_df, rownames = FALSE)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

5 Download Tables to Approximate SQL

#thousandsites = site_df %>% dplyr::filter(!is.na(datasetid))
#dset_string=paste0(thousandsites$datasetid[1:1000],collapse=",")


#data_download = content(GET(paste0("https://api.neotomadb.org/v2.0/data/downloads/",dset_string,"&limit=9999")))$data